{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 编程实践一 强化学习问题建模"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 环境类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Environment():\n",
    "    '''环境类。环境需要描述这个世界，响应与其进行交互的个体的行为，并向个体提供交互比较的信息\n",
    "    '''\n",
    "    def __init__(self):\n",
    "        # 个体在环境中所具有的行为和状态空间\n",
    "        self.action_space = None\n",
    "        self.status_space = None\n",
    "        print(\"构建环境完毕\")\n",
    "        pass\n",
    "\n",
    "    def dynamics(self, action):\n",
    "        obs_1, reward_1, is_done = None, None, False\n",
    "        # 确定个体的观测状态、即时奖励、是否交互结束\n",
    "        pass\n",
    "        # 返回给个体的信息 \n",
    "        print(\"环境: 获得个体行为，处理后返回给个体观测和及时奖励，并告知个体是否交互结束\")\n",
    "        return obs_1, reward_1, is_done\n",
    "    \n",
    "    def obs_space(self):\n",
    "        '''\n",
    "        根据自身的状态空间以及交互的个体身份确定该个体所具有的观测空间\n",
    "        '''\n",
    "        # 可默认具有环境的状态空间，此时对该个体来说是完全可观测环境\n",
    "        print(\"环境:开放观测空间给个体\")\n",
    "        obs_space = self.status_space \n",
    "        return obs_space\n",
    "        \n",
    "    def act_space(self):\n",
    "        '''根据环境的行为空间以及交互的个体身份确定该个体所具有的行为空间\n",
    "        '''\n",
    "        print(\"环境:开放行为空间给个体\")\n",
    "        act_space = self.action_space\n",
    "        return act_space\n",
    "        \n",
    "    def reset(self):\n",
    "        '''重新设定环境信息，给个体一个初始观测\n",
    "        '''\n",
    "        print(\"重置环境信息\")\n",
    "        agent_start_obs = None # 初始状态下个体的观测\n",
    "        return agent_start_obs\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 个体类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Agent():\n",
    "    '''个体类。\n",
    "    '''\n",
    "    def __init__(self, env = None, name = \"agent1\"):\n",
    "        self.env = env\n",
    "        self.name = name\n",
    "        self.act_space = env.act_space()\n",
    "        self.obs_space = env.obs_space()\n",
    "\n",
    "        self.values = None\n",
    "        self.policy = None\n",
    "        self.memory = None\n",
    "\n",
    "        self.obs_0 = None # t=0时刻个体的观测\n",
    "        print(\"构建个体完毕\")\n",
    "        pass\n",
    "\n",
    "    def __str__(self):\n",
    "        return self.name\n",
    "        pass\n",
    "    \n",
    "    def update_values(self):\n",
    "        print(\"个体：更新观测状态价值\")\n",
    "        # self.values = # Code here\n",
    "        pass\n",
    "    \n",
    "    def update_policy(self):\n",
    "        print(\"个体：更新策略\")\n",
    "        # self.policy = # Code here\n",
    "        pass\n",
    "\n",
    "    def update_model(self):\n",
    "        print(\"个体：更新模型\")\n",
    "        # self.model = # Code here\n",
    "        pass \n",
    "\n",
    "    def update_memory(self,\n",
    "                      obs_0 = None,\n",
    "                      action_0 = None,\n",
    "                      reward_1 = None,\n",
    "                      is_done = None,\n",
    "                      obs_1 = None):\n",
    "        print(\"个体：当前状态转换加入记忆中\")\n",
    "        # self.memory = # Code here\n",
    "        pass\n",
    "\n",
    "    def perform_policy(self, policy = None, obs = None):\n",
    "        # 产生一个行为\n",
    "        if policy is not None:\n",
    "            action = policy(obs)\n",
    "        else:\n",
    "            action = None # 随即产生\n",
    "        print(\"个体：依据策略产生一个行为\")\n",
    "        return action\n",
    "\n",
    "    def model(self, action = None):\n",
    "        print(\"个体：思考行为可能带来的下一时刻的观测、及时奖励及是否交互结束\")\n",
    "        # 思考个体的观测状态、即时奖励、是否交互结束\n",
    "        v_obs_1, v_reward_1, v_is_done = None, None, None\n",
    "        # 依据action确定v_obs_1, v_reward_1, v_is_done\n",
    "        # 返回给个体的虚拟信息 \n",
    "        # 也可以把思考的过程变为记忆的一部分\n",
    "        return v_obs_1, v_reward_1, v_is_done\n",
    "        \n",
    "    def act(self, action_0):\n",
    "        # 调用环境的动力学方法\n",
    "        print(\"个体：执行一个行为\")\n",
    "        obs_1, reward_1, is_done = self.env.dynamics(action_0)\n",
    "        self.update_memory(self.obs_0, action_0, reward_1, is_done, obs_1)\n",
    "        self.obs_0 = obs_1\n",
    "        pass\n",
    "    \n",
    "    \n",
    "    def learning(self):\n",
    "        '''个体的学习过程\n",
    "        '''\n",
    "        self.obs_0 = env.reset()\n",
    "        policy = None # 选定一个策略\n",
    "        end_condition = False # 设定一个终止条件\n",
    "        while( not end_condition):\n",
    "            obs = self.obs_0\n",
    "            act_0 = self.perform_policy(policy, obs)\n",
    "            self.act(act_0)\n",
    "            self.update_policy()\n",
    "            # addtional code here\n",
    "            end_condition = True\n",
    "        pass\n",
    "\n",
    "    def planning(self):\n",
    "        '''个体的规划过程\n",
    "        '''\n",
    "        policy = None # 选定一个策略\n",
    "        end_condition = False # 设定一个终止条件\n",
    "        obs_0 = None # 选定一个观测状态\n",
    "        while(not end_condition):\n",
    "            obs = self.obs_0\n",
    "            v_act_0 = self.perform_policy(policy, obs)\n",
    "            self.model(action = v_act_0)\n",
    "            self.update_policy()  \n",
    "            # addtional code here\n",
    "            end_condition = True\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "构建环境完毕\n",
      "环境:开放行为空间给个体\n",
      "环境:开放观测空间给个体\n",
      "构建个体完毕\n"
     ]
    }
   ],
   "source": [
    "env = Environment()\n",
    "agent = Agent(env = env, name = \"agent_1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "重置环境信息\n"
     ]
    }
   ],
   "source": [
    "env.reset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "个体：依据策略产生一个行为\n"
     ]
    }
   ],
   "source": [
    "act = agent.perform_policy(None, agent.obs_0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "个体：执行一个行为\n",
      "环境: 获得个体行为，处理后返回给个体观测和及时奖励，并告知个体是否交互结束\n",
      "个体：当前状态转换加入记忆中\n"
     ]
    }
   ],
   "source": [
    "agent.act(act)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "重置环境信息\n",
      "个体：依据策略产生一个行为\n",
      "个体：执行一个行为\n",
      "环境: 获得个体行为，处理后返回给个体观测和及时奖励，并告知个体是否交互结束\n",
      "个体：当前状态转换加入记忆中\n",
      "个体：更新策略\n"
     ]
    }
   ],
   "source": [
    "agent.learning()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "个体：依据策略产生一个行为\n",
      "个体：思考行为可能带来的下一时刻的观测、及时奖励及是否交互结束\n",
      "个体：更新策略\n"
     ]
    }
   ],
   "source": [
    "agent.planning()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
